########################################################
# Project:    Talking to the Populist Radical Right
# Task:       The script produces the raw correspondence
#             analysis scores for the Swedish parties
# Author:     Jan Schwalbach (21/07/2022)
########################################################

# Loading packages and data
library(quanteda.textmodels)
library(quanteda.textplots)

library(quanteda)
library(RCurl)
library(XML)
library(stringr)
library(lubridate)
library(caret)
library(e1071)
library(plyr)
library(ggplot2)
library(dplyr)
library(readtext)
library(data.table)
library(wordshoal)
library(DT)
library(xtable)
library(zoo)
library(tm)
library(ggrepel)
library(grid)
library(ngram)

# Read the stored speech data; the .Rdata file is presumably what provides
# the `corpusSW` object used from here on.
load(file = "Corpus_Sweden.Rdata")

# Restrict the corpus to sessions from 2010-09-19 up to and including
# 2018-09-09 (both bounds are inclusive).
corpusSW <- corpusSW[corpusSW$sessiondate >= "2010-09-19", ]
corpusSW <- corpusSW[corpusSW$sessiondate <= "2018-09-09", ]

### Subsetting the data set

# To get the scores for each type of debate, set `debate_topic` to exactly
# one of the three options:
#
#   "all"          1. All speeches: the corpus is left unchanged
#   "immigration"  2. Immigration subsetting
#   "education"    3. Education subsetting
#
# The two topic filters used to be plain top-level code, so running the
# script from top to bottom applied the immigration filter and then the
# education filter to its result. The selector makes the choice explicit.
debate_topic <- "all"

# Keep the speeches belonging to one topic.
#
# corpus:    data frame of speeches
# pres_col:  name of the column that is compared against "TRUE"
# count_col: name of the column that is compared against the thresholds
#
# Rows are kept when `pres_col` equals "TRUE" or `count_col` is above 2;
# exact duplicate rows are dropped and, finally, only rows with `count_col`
# above 0 remain. which() is used so that rows whose condition evaluates to
# NA are dropped instead of being returned as all-NA rows.
subset_topic <- function(corpus, pres_col, count_col) {
  flagged <- corpus[which(corpus[[pres_col]] == "TRUE"), ]
  frequent <- corpus[which(corpus[[count_col]] > 2), ]
  selected <- rbind(flagged, frequent)
  selected <- selected[!duplicated(selected), ]
  selected[which(selected[[count_col]] > 0), ]
}

corpusSW <- switch(debate_topic,
  all = corpusSW,
  immigration = subset_topic(corpusSW, "migpres1", "migrationcount"),
  education = subset_topic(corpusSW, "educationpres1", "educationcount"),
  stop("Unknown debate_topic: ", debate_topic, call. = FALSE)
)

# Rearranging date variables
# Builds the party/time/debate identifier columns used by the later steps.

# Replace every occurrence of "FP" in the party label with "L"
corpusSW$party <- gsub("FP", "L", corpusSW$party)
# Party label and year pasted together (separated by a space)
corpusSW$partyyear <- paste(corpusSW$party,corpusSW$year)
# Numeric conversions of the session date and of the quarter
corpusSW$date1 <- as.numeric(corpusSW$sessiondate)
corpusSW$quarter1 <- as.numeric(corpusSW$quarter)
# Party label combined with the numeric quarter / the numeric date
corpusSW$partyquarter <- paste(corpusSW$party,corpusSW$quarter1)
corpusSW$partydate <- paste(corpusSW$party,corpusSW$date1)
# Debate identifier: date plus debate.
# NOTE(review): `date` is read here but is only assigned from `sessiondate`
# in the aggregation step further down. If the loaded corpus has no `date`
# column, `$` on a plain data frame may partially match `date1` instead --
# confirm which column is intended.
corpusSW$debateunique <- paste(corpusSW$date,corpusSW$debate)
# Debate identifier extended by the raw speaker name
corpusSW$speakerdebateunique <- paste(corpusSW$debateunique,corpusSW$speaker_raw)

# Aggregating all speeches by a party on the debate level

# Strip punctuation from the identifiers, then build the grouping key from
# the party-quarter label and the debate identifier.
corpusSW$speakerdebateunique <- removePunctuation(corpusSW$speakerdebateunique)
corpusSW$debateunique <- removePunctuation(corpusSW$debateunique)
corpusSW$debateparty <- paste(corpusSW$partyquarter, corpusSW$debateunique)
corpusSW$date <- corpusSW$sessiondate
library(plyr)

# One row per `debateparty`: within each group the values of every listed
# column are pasted together, separated by ",".
corpusnew <- ddply(
  corpusSW, .(debateparty), summarize,
  date = paste(date, collapse = ","),
  partyyear = paste(year, collapse = ","),
  partyquarter = paste(partyquarter, collapse = ","),
  debateunique = paste(debateunique, collapse = ","),
  text = paste(text, collapse = ","),
  year = paste(year, collapse = ","),
  type = paste(type, collapse = ","),
  party = paste(party, collapse = ",")
)

# For the metadata columns only the first pasted element is kept: everything
# from the first comma onwards is removed. `text` keeps the full pasted string.
first_value_cols <- c(
  "party", "date", "type", "partyyear",
  "debateunique", "year", "partyquarter"
)
corpusnew[first_value_cols] <- lapply(
  corpusnew[first_value_cols],
  function(values) gsub("\\,.*", "", values, fixed = FALSE)
)

# Remove runs of capital letters followed by a space (the party label) from
# the party-quarter string.
corpusnew$partyquarter <- gsub("[A-Z]* ", "", corpusnew$partyquarter, fixed = FALSE)

corpusSW <- corpusnew
corpusSW$quarter <- as.yearqtr(corpusSW$date) # quarter
corpusSW <- as.data.frame(corpusSW)
corpusSW$date <- as.Date(corpusSW$date)
# `partyyear` holds the year at this point; prefix it with the party label
corpusSW$partyyear <- paste(corpusSW$party, corpusSW$partyyear)

# Keeping only debates with more than two rows (one row per party-quarter
# and debate after the aggregation above) and texts of at least 50 words.
# NOTE(review): the original header said "more than one party" and "more
# than 50 words"; the code requires n() > 2 and length >= 50 -- confirm
# which thresholds are intended.

rows_by_debate <- group_by(corpusSW, debateunique)
corpusSW <- as.data.frame(filter(rows_by_debate, n() > 2))

# Word count of the pasted text; rows below 50 words are dropped
corpusSW$length <- str_count(corpusSW$text, "\\w+")
corpusSW <- corpusSW[corpusSW$length >= 50, ]

# Dividing the corpus into legislative periods
# Both bounds are inclusive, so a row dated 2014-09-14 satisfies the
# conditions of both periods.

corpusSW2010 <- corpusSW[corpusSW$date >= "2010-09-19", ]
corpusSW2010 <- corpusSW2010[corpusSW2010$date <= "2014-09-14", ]

corpusSW2014 <- corpusSW[corpusSW$date >= "2014-09-14", ]
corpusSW2014 <- corpusSW2014[corpusSW2014$date <= "2018-09-09", ]

# Government / opposition debates within each period
corpusSW2010g <- corpusSW2010[corpusSW2010$type == "government", ]
corpusSW2010o <- corpusSW2010[corpusSW2010$type == "opposition", ]

corpusSW2014g <- corpusSW2014[corpusSW2014$type == "government", ]
corpusSW2014o <- corpusSW2014[corpusSW2014$type == "opposition", ]

### To get the scores for all/government/opposition debates of a
### legislative period, set the two selectors below.
#
# The six corpus() calls used to be listed one after the other, so each one
# overwrote the previous result and only the last (2014, opposition) took
# effect. The defaults reproduce that result; exactly one corpus is built.
legislative_period <- "2014" # "2010" or "2014"
debate_type <- "opposition"  # "all", "government" or "opposition"

corpus_choices <- list(
  "2010" = list(
    all = corpusSW2010,
    government = corpusSW2010g,
    opposition = corpusSW2010o
  ),
  "2014" = list(
    all = corpusSW2014,
    government = corpusSW2014g,
    opposition = corpusSW2014o
  )
)

if (!(legislative_period %in% names(corpus_choices))) {
  stop("Unknown legislative_period: ", legislative_period, call. = FALSE)
}
if (!(debate_type %in% names(corpus_choices[[legislative_period]]))) {
  stop("Unknown debate_type: ", debate_type, call. = FALSE)
}

corpusSWcorpus <- corpus(corpus_choices[[legislative_period]][[debate_type]])

# Creating a dfm: tokenise without punctuation, symbols and numbers, drop
# Swedish stopwords, stem, and pool the counts by party

Sweden_token <- tokens(
  corpusSWcorpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)

# tokens_remove() is the shortcut for tokens_select(..., selection = "remove")
Sweden_token <- tokens_remove(Sweden_token, pattern = stopwords("swedish"))

# Stemming is applied to the features of the document-feature matrix
dfm_Sweden <- dfm_wordstem(dfm(Sweden_token), language = "swedish")
# Optional trimming step, currently disabled:
# dfm_Sweden <- dfm_trim(dfm_Sweden, min_docfreq = 1)

# One document per party
dfm_Sweden <- dfm_group(dfm_Sweden, groups = party)

# Running the correspondence analysis

tmod_ca <- textmodel_ca(dfm_Sweden)

# Document coefficients on the first and the second dimension
scores_dim1 <- coef(tmod_ca, doc_dim = 1)$coef_document
scores_dim2 <- coef(tmod_ca, doc_dim = 2)$coef_document
dat_ca <- data.frame(dim1 = scores_dim1, dim2 = scores_dim2)

dat_ca

# Copy of the scores with a party label taken from the model's row names;
# a space followed by four digits and everything after it is cut off.
allvalues <- dat_ca
allvalues$party <- gsub(
  " [0-9][0-9][0-9][0-9].*", "", tmod_ca$rownames,
  fixed = FALSE
)
textplot_scale1d(tmod_ca)

# Plotting the respective values for the two most important dimensions

# Fill colours for the party labels
party_fill_colours <- c(
  "seagreen4", "blue4", "steelblue4", "steelblue1", "limegreen",
  "red", "yellow", "red4", "dodgerblue4"
)

p <- ggplot(allvalues, aes(dim1, dim2)) +
  scale_fill_manual(values = party_fill_colours) +
  geom_point(color = "red") +
  theme_classic(base_size = 20) +
  geom_label_repel(
    aes(label = party, fill = party),
    color = "white",
    size = 5
  ) +
  theme(legend.position = "none") +
  labs(title = "Debates Sweden", y = "Dimension 2", x = "Dimension 1")
p

### Creating data sets for the analysis for:

# all debates
#
# Three copies of `allvalues` are made and their first two columns renamed
# for all / government / opposition debates. As written, all three copies
# hold whatever `allvalues` contains when this section runs.

CA_SW_all <- allvalues
names(CA_SW_all)[1:2] <- c("CA_SW_all_D1", "CA_SW_all_D2")

CA_SW_all_gov <- allvalues
names(CA_SW_all_gov)[1:2] <- c("CA_SW_all_gov_D1", "CA_SW_all_gov_D2")

CA_SW_all_op <- allvalues
names(CA_SW_all_op)[1:2] <- c("CA_SW_all_op_D1", "CA_SW_all_op_D2")

# Bind the three sets column-wise and store the same object in both files
Positions_CA_SW_ALL <- cbind(CA_SW_all, CA_SW_all_gov, CA_SW_all_op)
save(Positions_CA_SW_ALL, file = "CA_SW_ALL.Rdata")
save(Positions_CA_SW_ALL, file = "CA_SW_ALL_14.Rdata")

# Immigration debates
#
# Three copies of `allvalues` are made and their first two columns renamed
# for all / government / opposition debates. As written, all three copies
# hold whatever `allvalues` contains when this section runs.

CA_SW_MIG <- allvalues
names(CA_SW_MIG)[1] <- "CA_SW_MIG_D1"
# Was "CA_sW_MIG_D2" (lower-case "s"), which broke the naming scheme shared
# by every other score column.
# NOTE(review): update any downstream code that still reads the old name.
names(CA_SW_MIG)[2] <- "CA_SW_MIG_D2"

CA_SW_MIG_gov <- allvalues
names(CA_SW_MIG_gov)[1] <- "CA_SW_MIG_gov_D1"
names(CA_SW_MIG_gov)[2] <- "CA_SW_MIG_gov_D2"

CA_SW_MIG_op <- allvalues
names(CA_SW_MIG_op)[1] <- "CA_SW_MIG_op_D1"
names(CA_SW_MIG_op)[2] <- "CA_SW_MIG_op_D2"

# Bind the three sets column-wise and store the same object in both files
Positions_CA_SW_MIG <- cbind(CA_SW_MIG, CA_SW_MIG_gov, CA_SW_MIG_op)
save(Positions_CA_SW_MIG, file = "CA_SW_MIG.Rdata") # save positions
save(Positions_CA_SW_MIG, file = "CA_SW_MIG_14.Rdata") # save positions

# Education debates
#
# Three copies of `allvalues` are made and their first two columns renamed
# for all / government / opposition debates. As written, all three copies
# hold whatever `allvalues` contains when this section runs.

CA_SW_ED <- allvalues
names(CA_SW_ED)[1:2] <- c("CA_SW_ED_D1", "CA_SW_ED_D2")

CA_SW_ED_gov <- allvalues
names(CA_SW_ED_gov)[1:2] <- c("CA_SW_ED_gov_D1", "CA_SW_ED_gov_D2")

CA_SW_ED_op <- allvalues
names(CA_SW_ED_op)[1:2] <- c("CA_SW_ED_op_D1", "CA_SW_ED_op_D2")

# Bind the three sets column-wise and store the same object in both files
Positions_CA_SW_ED <- cbind(CA_SW_ED, CA_SW_ED_gov, CA_SW_ED_op)
save(Positions_CA_SW_ED, file = "CA_SW_ED.Rdata")
save(Positions_CA_SW_ED, file = "CA_SW_ED_14.Rdata")
